#include "line2Dup.h"
#include <iostream>
#include "fusion.h"
#include"omp.h"
using namespace std;
using namespace cv;

namespace line2Dup
{
	/**
	 * \brief Get the label [0,8) of the single bit set in quantized.
	 */
	static inline int getLabel(int quantized)
	{
		// quantized must have exactly one bit set among the low 8; the label is
		// that bit's index. Scan rather than switch on each power of two.
		for (int label = 0; label < 8; ++label)
		{
			if (quantized == (1 << label))
				return label;
		}
		CV_Error(Error::StsBadArg, "Invalid value of quantized parameter");
		return -1; //avoid warning
	}

	void Feature::read(const FileNode& fn)
	{
		FileNodeIterator fni = fn.begin();
		fni >> x >> y >> label;
	}

	void Feature::write(FileStorage& fs) const
	{
		// Serialize as a compact flow sequence: [: x y label ]
		fs << "[:";
		fs << x;
		fs << y;
		fs << label;
		fs << "]";
	}

	void Template::read(const FileNode& fn)
	{
		width = fn["width"];
		height = fn["height"];
		tl_x = fn["tl_x"];
		tl_y = fn["tl_y"];
		pyramid_level = fn["pyramid_level"];

		FileNode features_fn = fn["features"];
		features.resize(features_fn.size());
		FileNodeIterator it = features_fn.begin(), it_end = features_fn.end();
		for (int i = 0; it != it_end; ++it, ++i)
		{
			features[i].read(*it);
		}
	}

	void Template::write(FileStorage& fs) const
	{
		// Scalar fields.
		fs << "width" << width;
		fs << "height" << height;
		fs << "tl_x" << tl_x;
		fs << "tl_y" << tl_y;
		fs << "pyramid_level" << pyramid_level;

		// Feature list as a sequence of [x, y, label] entries.
		fs << "features"
			<< "[";
		for (const Feature& f : features)
			f.write(fs);
		fs << "]"; // features
	}

	/**
	 * \brief Crop all templates to the joint bounding box of their features.
	 *
	 * Sets each template's width/height and top-left offset, and rebases feature
	 * coordinates relative to the new top-left corner.
	 *
	 * \return The bounding box in level-0 (full resolution) coordinates.
	 */
	static Rect cropTemplates(std::vector<Template>& templates)
	{
		int min_x = std::numeric_limits<int>::max();
		int min_y = std::numeric_limits<int>::max();
		int max_x = std::numeric_limits<int>::min();
		int max_y = std::numeric_limits<int>::min();

		// First pass: bounding box of every feature, scaled up to level 0.
		for (const Template& templ : templates)
		{
			for (const Feature& feat : templ.features)
			{
				const int x = feat.x << templ.pyramid_level;
				const int y = feat.y << templ.pyramid_level;
				min_x = std::min(min_x, x);
				min_y = std::min(min_y, y);
				max_x = std::max(max_x, x);
				max_y = std::max(max_y, y);
			}
		}

		/// @todo Why require even min_x, min_y?
		if (min_x % 2 == 1)
			--min_x;
		if (min_y % 2 == 1)
			--min_y;

		// Second pass: store cropped size/offset per template (in that template's
		// pyramid scale) and shift features relative to the new origin.
		for (Template& templ : templates)
		{
			templ.width = (max_x - min_x) >> templ.pyramid_level;
			templ.height = (max_y - min_y) >> templ.pyramid_level;
			templ.tl_x = min_x >> templ.pyramid_level;
			templ.tl_y = min_y >> templ.pyramid_level;

			for (Feature& feat : templ.features)
			{
				feat.x -= templ.tl_x;
				feat.y -= templ.tl_y;
			}
		}

		return Rect(min_x, min_y, max_x - min_x, max_y - min_y);
	}

	/**
	 * \brief Select a well-scattered subset of candidate features.
	 *
	 * Repeatedly scans the (score-sorted) candidate list, keeping a candidate only
	 * if it is at least `distance` pixels from every feature kept so far. On the
	 * first pass the distance is grown while it still yields >= num_features (to
	 * spread features as far apart as possible); afterwards it is relaxed until
	 * enough features are collected or the distance drops below 3.
	 *
	 * \param candidates    Candidate features, expected strongest-first.
	 * \param[out] features Receives the selected features.
	 * \param num_features  Desired number of features.
	 * \param distance      Initial minimum spacing between selected features.
	 * \return false if candidates is empty, true otherwise.
	 */
	bool ColorGradientPyramid::selectScatteredFeatures(const std::vector<Candidate>& candidates,
		std::vector<Feature>& features,
		size_t num_features, float distance)
	{
		features.clear();

		// Guard: the scan below reads candidates[0] unconditionally, which was
		// undefined behavior for an empty list.
		if (candidates.empty())
			return false;

		float distance_sq = distance * distance;
		int i = 0;

		bool first_select = true;

		while (true)
		{
			Candidate c = candidates[i];

			// Add if sufficient distance away from any previously chosen feature
			bool keep = true;
			for (int j = 0; (j < (int)features.size()) && keep; ++j)
			{
				Feature f = features[j];
				keep = (c.f.x - f.x) * (c.f.x - f.x) + (c.f.y - f.y) * (c.f.y - f.y) >= distance_sq;
			}
			if (keep)
				features.push_back(c.f);

			if (++i == (int)candidates.size()) {
				bool num_ok = features.size() >= num_features;

				if (first_select) {
					if (num_ok) {
						// Still enough features at this spacing: retry with a larger
						// spacing so the final selection is more scattered.
						features.clear(); // we don't want too many first time
						i = 0;
						distance += 1.0f;
						distance_sq = distance * distance;
						continue;
					}
					else {
						first_select = false;
					}
				}

				// Start back at beginning, and relax required distance
				i = 0;
				distance -= 1.0f;
				distance_sq = distance * distance;
				if (num_ok || distance < 3) {
					break;
				}
			}
		}
		return true;
	}

	/****************************************************************************************\
	*                                                         Color gradient ColorGradient                                                                        *
	\****************************************************************************************/

	/**
	 * \brief Quantize raw gradient orientations into 8 one-hot labels with local voting.
	 *
	 * \param magnitude        Squared gradient magnitude image (CV_32F).
	 * \param[out] quantized_angle  Per-pixel label mask (1 << label), 0 where rejected (CV_8U).
	 * \param angle            Raw orientations in degrees, expected in [0, 360).
	 * \param threshold        Minimum magnitude (already squared by the caller) to consider a pixel.
	 *
	 * \note Despite the name, this does 3x3 majority voting rather than true hysteresis.
	 */
	void hysteresisGradient(Mat& magnitude, Mat& quantized_angle,
		Mat& angle, float threshold)
	{
		// Quantize 360 degree range of orientations into 16 buckets
		// Note that [0, 11.25), [348.75, 360) both get mapped in the end to label 0,
		// for stability of horizontal and vertical features.
		Mat_<unsigned char> quantized_unfiltered;
		angle.convertTo(quantized_unfiltered, CV_8U, 16.0 / 360.0);

		// Zero out top and bottom rows
		/// @todo is this necessary, or even correct?
		memset(quantized_unfiltered.ptr(), 0, quantized_unfiltered.cols);
		memset(quantized_unfiltered.ptr(quantized_unfiltered.rows - 1), 0, quantized_unfiltered.cols);
		// Zero out first and last columns
		for (int r = 0; r < quantized_unfiltered.rows; ++r)
		{
			quantized_unfiltered(r, 0) = 0;
			quantized_unfiltered(r, quantized_unfiltered.cols - 1) = 0;
		}

		// Mask 16 buckets into 8 quantized orientations
		// (&7 folds opposite directions together: bucket b and b+8 get the same label)
		for (int r = 1; r < angle.rows - 1; ++r)
		{
			uchar* quant_r = quantized_unfiltered.ptr<uchar>(r);
			for (int c = 1; c < angle.cols - 1; ++c)
			{
				quant_r[c] &= 7;
			}
		}

		// Filter the raw quantized image. Only accept pixels where the magnitude is above some
		// threshold, and there is local agreement on the quantization.
		quantized_angle = Mat::zeros(angle.size(), CV_8U);
		for (int r = 1; r < angle.rows - 1; ++r)
		{
			float* mag_r = magnitude.ptr<float>(r);

			for (int c = 1; c < angle.cols - 1; ++c)
			{
				if (mag_r[c] > threshold)
				{
					// Compute histogram of quantized bins in 3x3 patch around pixel
					int histogram[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };

					// Walk the three rows of the patch via the Mat's row stride.
					uchar* patch3x3_row = &quantized_unfiltered(r - 1, c - 1);
					histogram[patch3x3_row[0]]++;
					histogram[patch3x3_row[1]]++;
					histogram[patch3x3_row[2]]++;

					patch3x3_row += quantized_unfiltered.step1();
					histogram[patch3x3_row[0]]++;
					histogram[patch3x3_row[1]]++;
					histogram[patch3x3_row[2]]++;

					patch3x3_row += quantized_unfiltered.step1();
					histogram[patch3x3_row[0]]++;
					histogram[patch3x3_row[1]]++;
					histogram[patch3x3_row[2]]++;

					// Find bin with the most votes from the patch
					int max_votes = 0;
					int index = -1;
					for (int i = 0; i < 8; ++i)
					{
						if (max_votes < histogram[i])
						{
							index = i;
							max_votes = histogram[i];
						}
					}

					// Only accept the quantization if majority of pixels in the patch agree
					static const int NEIGHBOR_THRESHOLD = 5;
					if (max_votes >= NEIGHBOR_THRESHOLD)
						quantized_angle.at<uchar>(r, c) = uchar(1 << index);
				}
			}
		}
	}

	/**
	 * \brief Compute squared gradient magnitude and (quantized and raw) orientations.
	 *
	 * \param      src        Input image, 1 or 3 channels.
	 * \param[out] magnitude  Squared gradient magnitude (CV_32F).
	 * \param[out] angle      Quantized one-hot orientation labels (CV_8U).
	 * \param[out] angle_ori  Raw orientation in degrees (CV_32F, from cv::phase).
	 * \param      threshold  Magnitude threshold; squared before being compared.
	 */
	static void quantizedOrientations(const Mat& src, Mat& magnitude,
		Mat& angle, Mat& angle_ori, float threshold)
	{
		Mat smoothed;
		// Compute horizontal and vertical image derivatives on all color channels separately
		static const int KERNEL_SIZE = 3;
		// For some reason cvSmooth/cv::GaussianBlur, cvSobel/cv::Sobel have different defaults for border handling...
		GaussianBlur(src, smoothed, Size(KERNEL_SIZE, KERNEL_SIZE), 0, 0, BORDER_REPLICATE);

		if (src.channels() == 1) {
			// Single-channel: straightforward Sobel + phase.
			Mat sobel_dx, sobel_dy, sobel_ag;
			Sobel(smoothed, sobel_dx, CV_32F, 1, 0, 3, 1.0, 0.0, BORDER_REPLICATE);
			Sobel(smoothed, sobel_dy, CV_32F, 0, 1, 3, 1.0, 0.0, BORDER_REPLICATE);
			// Note: magnitude is kept squared (dx^2 + dy^2); threshold is squared to match.
			magnitude = sobel_dx.mul(sobel_dx) + sobel_dy.mul(sobel_dy);
			phase(sobel_dx, sobel_dy, sobel_ag, true);
			hysteresisGradient(magnitude, angle, sobel_ag, threshold * threshold);
			angle_ori = sobel_ag;

		}
		else {

			magnitude.create(src.size(), CV_32F);

			// Allocate temporary buffers
			Size size = src.size();
			Mat sobel_3dx;              // per-channel horizontal derivative
			Mat sobel_3dy;              // per-channel vertical derivative
			Mat sobel_dx(size, CV_32F); // maximum horizontal derivative
			Mat sobel_dy(size, CV_32F); // maximum vertical derivative
			Mat sobel_ag;               // final gradient orientation (unquantized)

			Sobel(smoothed, sobel_3dx, CV_16S, 1, 0, 3, 1.0, 0.0, BORDER_REPLICATE);
			Sobel(smoothed, sobel_3dy, CV_16S, 0, 1, 3, 1.0, 0.0, BORDER_REPLICATE);

			// Raw pointer walk over rows; each per-row advance uses that Mat's own
			// step (in elements), since strides may differ per allocation.
			short* ptrx = (short*)sobel_3dx.data;
			short* ptry = (short*)sobel_3dy.data;
			float* ptr0x = (float*)sobel_dx.data;
			float* ptr0y = (float*)sobel_dy.data;
			float* ptrmg = (float*)magnitude.data;

			const int length1 = static_cast<const int>(sobel_3dx.step1());
			const int length2 = static_cast<const int>(sobel_3dy.step1());
			const int length3 = static_cast<const int>(sobel_dx.step1());
			const int length4 = static_cast<const int>(sobel_dy.step1());
			const int length5 = static_cast<const int>(magnitude.step1());
			const int length0 = sobel_3dy.cols * 3; // 3 interleaved channels per pixel

			for (int r = 0; r < sobel_3dy.rows; ++r)
			{
				int ind = 0;

				for (int i = 0; i < length0; i += 3)
				{
					// Use the gradient orientation of the channel whose magnitude is largest
					int mag1 = ptrx[i + 0] * ptrx[i + 0] + ptry[i + 0] * ptry[i + 0];
					int mag2 = ptrx[i + 1] * ptrx[i + 1] + ptry[i + 1] * ptry[i + 1];
					int mag3 = ptrx[i + 2] * ptrx[i + 2] + ptry[i + 2] * ptry[i + 2];

					if (mag1 >= mag2 && mag1 >= mag3)
					{
						ptr0x[ind] = ptrx[i];
						ptr0y[ind] = ptry[i];
						ptrmg[ind] = (float)mag1;
					}
					else if (mag2 >= mag1 && mag2 >= mag3)
					{
						ptr0x[ind] = ptrx[i + 1];
						ptr0y[ind] = ptry[i + 1];
						ptrmg[ind] = (float)mag2;
					}
					else
					{
						ptr0x[ind] = ptrx[i + 2];
						ptr0y[ind] = ptry[i + 2];
						ptrmg[ind] = (float)mag3;
					}
					++ind;
				}
				ptrx += length1;
				ptry += length2;
				ptr0x += length3;
				ptr0y += length4;
				ptrmg += length5;
			}

			// Calculate the final gradient orientations
			phase(sobel_dx, sobel_dy, sobel_ag, true);
			hysteresisGradient(magnitude, angle, sobel_ag, threshold * threshold);
			angle_ori = sobel_ag;
		}


	}

	// Build the level-0 pyramid entry and immediately compute its gradients.
	ColorGradientPyramid::ColorGradientPyramid(const Mat& _src, const Mat& _mask,
		float _weak_threshold, size_t _num_features,
		float _strong_threshold)
		: src(_src),
		mask(_mask),
		pyramid_level(0),
		weak_threshold(_weak_threshold),
		num_features(_num_features),
		strong_threshold(_strong_threshold)
	{
		update(); // fills magnitude / angle / angle_ori for the initial level
	}

	// Recompute magnitude and quantized/raw orientation for the current src.
	void ColorGradientPyramid::update()
	{
		quantizedOrientations(src, magnitude, angle, angle_ori, weak_threshold);
	}

	void ColorGradientPyramid::pyrDown()
	{
		// Some parameters need to be adjusted
		num_features /= 2; /// @todo Why not 4?
		++pyramid_level;

		// Downsample the current inputs
		Size size(src.cols / 2, src.rows / 2);
		Mat next_src;
		cv::pyrDown(src, next_src, size);
		src = next_src;

		if (!mask.empty())
		{
			Mat next_mask;
			resize(mask, next_mask, size, 0.0, 0.0, INTER_NEAREST);
			mask = next_mask;
		}

		update();
	}

	// Export the quantized-orientation image, zeroing pixels outside the mask.
	void ColorGradientPyramid::quantize(Mat& dst) const
	{
		dst = Mat::zeros(angle.size(), CV_8U);
		angle.copyTo(dst, mask);
	}

	/**
	 * \brief Extract a template from the current pyramid level.
	 *
	 * Runs 3x3 non-maximum suppression on the (squared) gradient magnitude,
	 * collects candidates above strong_threshold^2 with a valid quantized angle,
	 * then picks a scattered subset via selectScatteredFeatures().
	 *
	 * \param[out] templ  Receives the features; width/height are set to -1 here
	 *                    and fixed up later (see cropTemplates()).
	 * \return false if too few candidates were found.
	 */
	bool ColorGradientPyramid::extractTemplate(Template& templ) const
	{
		// Want features on the border to distinguish from background
		Mat local_mask;
		if (!mask.empty())
		{
			erode(mask, local_mask, Mat(), Point(-1, -1), 1, BORDER_REPLICATE);
			//        subtract(mask, local_mask, local_mask);
		}

		std::vector<Candidate> candidates;
		bool no_mask = local_mask.empty();
		// magnitude holds squared values, so compare against the squared threshold.
		float threshold_sq = strong_threshold * strong_threshold;

		int nms_kernel_size = 3;
		// 255 = may still become a local maximum; 0 = suppressed by a stronger neighbor.
		cv::Mat magnitude_valid = cv::Mat(magnitude.size(), CV_8UC1, cv::Scalar(255));

		for (int r = 0 + nms_kernel_size / 2; r < magnitude.rows - nms_kernel_size / 2; ++r)
		{
			const uchar* mask_r = no_mask ? NULL : local_mask.ptr<uchar>(r);

			for (int c = 0 + nms_kernel_size / 2; c < magnitude.cols - nms_kernel_size / 2; ++c)
			{
				if (no_mask || mask_r[c])
				{
					float score = 0;
					if (magnitude_valid.at<uchar>(r, c) > 0) {
						score = magnitude.at<float>(r, c);
						bool is_max = true;
						// Reject the pixel if any neighbor in the NMS window is stronger.
						for (int r_offset = -nms_kernel_size / 2; r_offset <= nms_kernel_size / 2; r_offset++) {
							for (int c_offset = -nms_kernel_size / 2; c_offset <= nms_kernel_size / 2; c_offset++) {
								if (r_offset == 0 && c_offset == 0) continue;

								if (score < magnitude.at<float>(r + r_offset, c + c_offset)) {
									score = 0;
									is_max = false;
									break;
								}
							}
							if (!is_max) break;
						}

						// This pixel won: suppress its neighbors from later consideration.
						if (is_max) {
							for (int r_offset = -nms_kernel_size / 2; r_offset <= nms_kernel_size / 2; r_offset++) {
								for (int c_offset = -nms_kernel_size / 2; c_offset <= nms_kernel_size / 2; c_offset++) {
									if (r_offset == 0 && c_offset == 0) continue;
									magnitude_valid.at<uchar>(r + r_offset, c + c_offset) = 0;
								}
							}
						}
					}

					// Keep only strong, validly-quantized local maxima as candidates.
					if (score > threshold_sq && angle.at<uchar>(r, c) > 0)
					{
						candidates.push_back(Candidate(c, r, getLabel(angle.at<uchar>(r, c)), score));
						candidates.back().f.theta = angle_ori.at<float>(r, c);
					}
				}
			}
		}
		// We require a certain number of features
		if (candidates.size() < num_features) {
			if (candidates.size() <= 4) {
				//std::cout << "too few features, abort" << std::endl;

				//
				return false;
			}
			// std::cout << "have no enough features, exaustive mode" << std::endl;
		}

		// NOTE: Stable sort to agree with old code, which used std::list::sort()
		std::stable_sort(candidates.begin(), candidates.end());

		// Use heuristic based on surplus of candidates in narrow outline for initial distance threshold
		// (integer division is intentional: more surplus candidates -> larger initial spacing)
		float distance = static_cast<float>(candidates.size() / num_features + 1);

		// selectScatteredFeatures always return true
		if (!selectScatteredFeatures(candidates, templ.features, num_features, distance))
		{
			return false;
		}

		// Size determined externally, needs to match templates for other modalities
		templ.width = -1;
		templ.height = -1;
		templ.pyramid_level = pyramid_level;

		return true;
	}

	// Defaults: weak threshold 30 (quantization), 63 features, strong threshold 60
	// (feature extraction). 63 keeps similarity_64's 8-bit accumulation overflow-free.
	ColorGradient::ColorGradient()
		: weak_threshold(30.0f),
		num_features(63),
		strong_threshold(60.0f)
	{
	}

	// Fully parameterized constructor; see the default constructor for typical values.
	ColorGradient::ColorGradient(float _weak_threshold, size_t _num_features, float _strong_threshold)
		: weak_threshold(_weak_threshold),
		num_features(_num_features),
		strong_threshold(_strong_threshold)
	{
	}

	// Type tag identifying this modality in serialized files (see read()/write()).
	static const char CG_NAME[] = "ColorGradient";

	// Human-readable modality name; matches the serialized "type" field.
	std::string ColorGradient::name() const
	{
		return CG_NAME;
	}

	// Deserialize modality settings; asserts the stored type tag matches this modality.
	void ColorGradient::read(const FileNode& fn)
	{
		String type = fn["type"];
		CV_Assert(type == CG_NAME);

		weak_threshold = fn["weak_threshold"];
		num_features = int(fn["num_features"]);
		strong_threshold = fn["strong_threshold"];
	}

	// Serialize modality settings, tagged with CG_NAME so read() can validate them.
	void ColorGradient::write(FileStorage& fs) const
	{
		fs << "type" << CG_NAME;
		fs << "weak_threshold" << weak_threshold;
		fs << "num_features" << int(num_features);
		fs << "strong_threshold" << strong_threshold;
	}
	/****************************************************************************************\
	*                                                                 Response maps                                                                                    *
	\****************************************************************************************/

	/**
	 * \brief OR a width x height region of src into dst, row by row (dst |= src).
	 *
	 * src is generally unaligned because spread() shifts it one byte at a time, so
	 * each row is handled as: scalar head until 16-byte alignment, SIMD body,
	 * scalar tail.
	 *
	 * \param src         Top-left of the source region (may be unaligned).
	 * \param src_stride  Source row stride in bytes.
	 * \param dst         Top-left of the destination region.
	 * \param dst_stride  Destination row stride in bytes.
	 * \param width       Number of columns to OR per row.
	 * \param height      Number of rows.
	 */
	static void orUnaligned8u(const uchar* src, const int src_stride,
		uchar* dst, const int dst_stride,
		const int width, const int height)
	{
		for (int r = 0; r < height; ++r)
		{
			int c = 0;

			// Scalar head: advance until src is 16-byte aligned. Bounded by width so a
			// narrow span can never read/write past the requested region (the original
			// loop ignored width here).
			while (c < width && reinterpret_cast<unsigned long long>(src + c) % 16 != 0) {
				dst[c] |= src[c];
				c++;
			}

			// SIMD body; stops a full vector early so loads/stores stay in bounds.
			// note: can't use c<width !!!
			for (; c <= width - mipp::N<uint8_t>(); c += mipp::N<uint8_t>()) {
				mipp::Reg<uint8_t> src_v((uint8_t*)src + c);
				mipp::Reg<uint8_t> dst_v((uint8_t*)dst + c);

				mipp::Reg<uint8_t> res_v = mipp::orb(src_v, dst_v);
				res_v.store((uint8_t*)dst + c);
			}

			// Scalar tail for the leftover columns.
			for (; c < width; c++)
				dst[c] |= src[c];

			// Advance to next row
			src += src_stride;
			dst += dst_stride;
		}
	}

	/**
	 * \brief Spread the quantized orientations over a T x T neighborhood (section 2.3).
	 *
	 * dst(r, c) becomes the bitwise OR of src over the T x T window whose top-left
	 * corner is (r, c), built by OR-ing every (dy, dx)-shifted copy of src into dst.
	 */
	static void spread(const Mat& src, Mat& dst, int T)
	{
		// Allocate and zero-initialize spread (OR'ed) image
		dst = Mat::zeros(src.size(), CV_8U);

		for (int dy = 0; dy < T; ++dy)
		{
			for (int dx = 0; dx < T; ++dx)
			{
				// OR the copy of src shifted by (-dy, -dx) into dst; the region shrinks
				// by the shift so we never read outside src.
				orUnaligned8u(&src.at<unsigned char>(dy, dx), static_cast<const int>(src.step1()),
					dst.ptr(), static_cast<const int>(dst.step1()),
					src.cols - dx, src.rows - dy);
			}
		}
	}

	// Response value for a near-miss orientation (an exact match scores 4).
	static const unsigned char LUT3 = 3;
	// 1,2-->0 3-->LUT3
	// Similarity lookup table: indexed as SIMILARITY_LUT[32*ori + half*16 + nibble],
	// where `nibble` is the low (half=0) or high (half=1) 4 bits of a spread
	// orientation bitmask. Values are 0 (no match), LUT3 (near miss) or 4 (match).
	// 16-byte aligned so each 16-entry half can be loaded directly into a SIMD register.
	CV_DECL_ALIGNED(16)
		static const unsigned char SIMILARITY_LUT[256] = { 0, 4, LUT3, 4, 0, 4, LUT3, 4, 0, 4, LUT3, 4, 0, 4, LUT3, 4, 0, 0, 0, 0, 0, 0, 0, 0, LUT3, LUT3, LUT3, LUT3, LUT3, LUT3, LUT3, LUT3, 0, LUT3, 4, 4, LUT3, LUT3, 4, 4, 0, LUT3, 4, 4, LUT3, LUT3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, LUT3, LUT3, 4, 4, 4, 4, LUT3, LUT3, LUT3, LUT3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, LUT3, LUT3, LUT3, LUT3, 4, 4, 4, 4, 4, 4, 4, 4, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, 0, 0, 0, 0, 0, 0, 0, LUT3, LUT3, LUT3, LUT3, LUT3, LUT3, LUT3, LUT3, 0, 4, LUT3, 4, 0, 4, LUT3, 4, 0, 4, LUT3, 4, 0, 4, LUT3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, LUT3, 4, 4, LUT3, LUT3, 4, 4, 0, LUT3, 4, 4, LUT3, LUT3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, LUT3, LUT3, 4, 4, 4, 4, LUT3, LUT3, LUT3, LUT3, 4, 4, 4, 4, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, LUT3, 0, 0, 0, 0, LUT3, LUT3, LUT3, LUT3, 4, 4, 4, 4, 4, 4, 4, 4 };
	//static const unsigned char SIMILARITY_LUT[256] = { 0 };

	/**
	 * \brief Precompute one response map per quantized orientation (section 2.4).
	 *
	 * src is the spread orientation image (bitmask per pixel). For each orientation
	 * ori, response_maps[ori](r, c) holds the best SIMILARITY_LUT score (0, LUT3 or 4)
	 * between ori and any orientation present in src(r, c), computed by looking up
	 * the low and high nibble of the bitmask separately and taking the max.
	 */
	static void computeResponseMaps(const Mat& src, std::vector<Mat>& response_maps)
	{

		CV_Assert((src.rows * src.cols) % 16 == 0);

		// Allocate response maps
		response_maps.resize(8);
		for (int i = 0; i < 8; ++i)
			response_maps[i].create(src.size(), CV_8U);

		Mat lsb4(src.size(), CV_8U);
		Mat msb4(src.size(), CV_8U);

		// Split every spread byte into two 4-bit LUT indices.
		for (int r = 0; r < src.rows; ++r)
		{
			const uchar* src_r = src.ptr(r);
			uchar* lsb4_r = lsb4.ptr(r);
			uchar* msb4_r = msb4.ptr(r);

			for (int c = 0; c < src.cols; ++c)
			{
				// Least significant 4 bits of spread image pixel
				lsb4_r[c] = src_r[c] & 15;
				// Most significant 4 bits, right-shifted to be in [0, 16)
				msb4_r[c] = (src_r[c] & 240) >> 4;
			}
		}

		{
			uchar* lsb4_data = lsb4.ptr<uchar>();
			uchar* msb4_data = msb4.ptr<uchar>();

			// Capability flags: fall back to scalar unless the target SIMD ISA
			// provides both int8 max and int8 shuffle.
			bool no_max = true;
			bool no_shuff = true;

#ifdef has_max_int8_t
			no_max = false;
#endif

#ifdef has_shuff_int8_t
			no_shuff = false;
#endif
			// LUT is designed for 128 bits SIMD, so quite triky for others

			// For each of the 8 quantized orientations...
			for (int ori = 0; ori < 8; ++ori) {
				uchar* map_data = response_maps[ori].ptr<uchar>();
				const uchar* lut_low = SIMILARITY_LUT + 32 * ori;

				if (mipp::N<uint8_t>() == 1 || no_max || no_shuff) { // no SIMD
					for (int i = 0; i < src.rows * src.cols; ++i)
						map_data[i] = std::max(lut_low[lsb4_data[i]], lut_low[msb4_data[i] + 16]);
				}
				else if (mipp::N<uint8_t>() == 16) { // 128 SIMD, no add base

					// NOTE(review): shadows the outer lut_low with the same value.
					const uchar* lut_low = SIMILARITY_LUT + 32 * ori;
					mipp::Reg<uint8_t> lut_low_v((uint8_t*)lut_low);
					mipp::Reg<uint8_t> lut_high_v((uint8_t*)lut_low + 16);

					for (int i = 0; i < src.rows * src.cols; i += mipp::N<uint8_t>()) {
						mipp::Reg<uint8_t> low_mask((uint8_t*)lsb4_data + i);
						mipp::Reg<uint8_t> high_mask((uint8_t*)msb4_data + i);

						// Byte-shuffle = 16-way table lookup; keep the better nibble score.
						mipp::Reg<uint8_t> low_res = mipp::shuff(lut_low_v, low_mask);
						mipp::Reg<uint8_t> high_res = mipp::shuff(lut_high_v, high_mask);

						mipp::Reg<uint8_t> result = mipp::max(low_res, high_res);
						result.store((uint8_t*)map_data + i);
					}
				}
				else if (mipp::N<uint8_t>() == 16 || mipp::N<uint8_t>() == 32
					|| mipp::N<uint8_t>() == 64) { //128 256 512 SIMD
					// NOTE(review): the == 16 case is already handled above, so this branch
					// effectively serves the 256/512-bit widths (N == 32 or 64).
					CV_Assert((src.rows * src.cols) % mipp::N<uint8_t>() == 0);

					// Replicate each 16-byte LUT half across every 16-byte lane, and build
					// a per-lane base offset so lane-local shuffles index correctly.
					uint8_t lut_temp[mipp::N<uint8_t>()] = { 0 };

					for (int slice = 0; slice < mipp::N<uint8_t>() / 16; slice++) {
						std::copy_n(lut_low, 16, lut_temp + slice * 16);
					}
					mipp::Reg<uint8_t> lut_low_v(lut_temp);

					uint8_t base_add_array[mipp::N<uint8_t>()] = { 0 };
					for (uint8_t slice = 0; slice < mipp::N<uint8_t>(); slice += 16) {
						std::copy_n(lut_low + 16, 16, lut_temp + slice);
						std::fill_n(base_add_array + slice, 16, slice);
					}
					mipp::Reg<uint8_t> base_add(base_add_array);
					mipp::Reg<uint8_t> lut_high_v(lut_temp);

					for (int i = 0; i < src.rows * src.cols; i += mipp::N<uint8_t>()) {
						mipp::Reg<uint8_t> mask_low_v((uint8_t*)lsb4_data + i);
						mipp::Reg<uint8_t> mask_high_v((uint8_t*)msb4_data + i);

						mask_low_v += base_add;
						mask_high_v += base_add;

						mipp::Reg<uint8_t> shuff_low_result = mipp::shuff(lut_low_v, mask_low_v);
						mipp::Reg<uint8_t> shuff_high_result = mipp::shuff(lut_high_v, mask_high_v);

						mipp::Reg<uint8_t> result = mipp::max(shuff_low_result, shuff_high_result);
						result.store((uint8_t*)map_data + i);
					}
				}
				else {
					// Scalar fallback for any other vector width.
					for (int i = 0; i < src.rows * src.cols; ++i)
						map_data[i] = std::max(lut_low[lsb4_data[i]], lut_low[msb4_data[i] + 16]);
				}
			}


		}
	}

	/**
	 * \brief Rearrange a response map into T*T linear memories.
	 *
	 * Row k of `linearized` (k = r_start*T + c_start) holds every T-th pixel of
	 * every T-th row of the response map, starting at (r_start, c_start), stored
	 * contiguously so a template shift becomes a simple pointer offset.
	 */
	static void linearize(const Mat& response_map, Mat& linearized, int T)
	{
		CV_Assert(response_map.rows % T == 0);
		CV_Assert(response_map.cols % T == 0);

		// One linear memory per (r_start, c_start) phase; each is a decimated image.
		const int mem_width = response_map.cols / T;
		const int mem_height = response_map.rows / T;
		linearized.create(T * T, mem_width * mem_height, CV_8U);

		int row_index = 0;
		for (int r_start = 0; r_start < T; ++r_start)
		{
			for (int c_start = 0; c_start < T; ++c_start)
			{
				uchar* memory = linearized.ptr(row_index++);

				// Gather every T-th pixel into the linear memory.
				for (int r = r_start; r < response_map.rows; r += T)
				{
					const uchar* response_data = response_map.ptr(r);
					for (int c = c_start; c < response_map.cols; c += T)
						*memory++ = response_data[c];
				}
			}
		}
	}
	/****************************************************************************************\
	*                                                             Linearized similarities                                                                    *
	\****************************************************************************************/
	/**
	 * \brief Locate feature f's response row inside the linearized memories.
	 *
	 * \param linear_memories  One T*T linear-memory Mat per orientation label.
	 * \param f                Feature whose (x, y, label) select the memory and offset.
	 * \param T                Sampling step used by linearize().
	 * \param W                Width of each linear memory (image width / T).
	 * \return Pointer to the response byte for f; subsequent bytes correspond to
	 *         template shifts in linear-memory order.
	 *
	 * NOTE(review): removed leftover std::chrono profiling whose results were
	 * computed and discarded on every call of this hot-path helper.
	 */
	static const unsigned char* accessLinearMemory(const std::vector<Mat>& linear_memories,
		const Feature& f, int T, int W)
	{
		// Retrieve the TxT grid of linear memories associated with the feature label
		const Mat& memory_grid = linear_memories[f.label];
		CV_DbgAssert(memory_grid.rows == T * T);
		CV_DbgAssert(f.x >= 0);
		CV_DbgAssert(f.y >= 0);
		// The LM we want is at (x%T, y%T) in the TxT grid (stored as the rows of memory_grid)
		int grid_x = f.x % T;
		int grid_y = f.y % T;
		int grid_index = grid_y * T + grid_x;
		CV_DbgAssert(grid_index >= 0);
		CV_DbgAssert(grid_index < memory_grid.rows);
		const unsigned char* memory = memory_grid.ptr(grid_index);
		// Within the LM, the feature is at (x/T, y/T). W is the "width" of the LM, the
		// input image width decimated by T.
		int lm_x = f.x / T;
		int lm_y = f.y / T;
		int lm_index = lm_y * W + lm_x;
		CV_DbgAssert(lm_index >= 0);
		CV_DbgAssert(lm_index < memory_grid.cols);

		return memory + lm_index;
	}
	/**
	 * \brief Accumulate whole-image similarity for one template (16-bit accumulator).
	 *
	 * For every feature, adds its linear-memory response row into dst; dst is an
	 * H x W CV_16U map over template positions decimated by T.
	 */
	static void similarity(const std::vector<Mat>& linear_memories, const Template& templ,
		Mat& dst, Size size, int T)
	{
		// we only have one modality, so 8192*2, due to mipp, back to 8192
		CV_Assert(templ.features.size() < 8192);

		// Decimate input image size by factor of T
		int W = size.width / T;
		int H = size.height / T;

		// Feature dimensions, decimated by factor T and rounded up
		int wf = (templ.width - 1) / T + 1;
		int hf = (templ.height - 1) / T + 1;

		// Span is the range over which we can shift the template around the input image
		int span_x = W - wf;
		int span_y = H - hf;

		int template_positions = span_y * W + span_x + 1; // why add 1?

		dst = Mat::zeros(H, W, CV_16U);
		short* dst_ptr = dst.ptr<short>();

		mipp::Reg<uint8_t> zero_v(uint8_t(0));
		for (int i = 0; i < (int)templ.features.size(); ++i)
		{
			Feature f = templ.features[i];

			// Discard feature if out of bounds
			if (f.x < 0 || f.x >= size.width || f.y < 0 || f.y >= size.height)
				continue;
			const uchar* lm_ptr = accessLinearMemory(linear_memories, f, T, W);
			int j = 0;

			// *2 to avoid int8 read out of range

			// SIMD body: widen uchar responses to short and add into the accumulator.
			// Stops N<int16_t>()*2 early so the 8-bit load below never runs past the end.
			for (; j <= template_positions - mipp::N<int16_t>() * 2; j += mipp::N<int16_t>()) {
				mipp::Reg<uint8_t> src8_v((uint8_t*)lm_ptr + j);
				// uchar to short, once for N bytes
				mipp::Reg<int16_t> src16_v(mipp::interleavelo(src8_v, zero_v).r);
				mipp::Reg<int16_t> dst_v((int16_t*)dst_ptr + j);
				mipp::Reg<int16_t> res_v = src16_v + dst_v;
				res_v.store((int16_t*)dst_ptr + j);
			}

			// Scalar tail.
			for (; j < template_positions; j++)
			{
				// lm_ptr[j] holds this feature's response at linear position j.
				dst_ptr[j] += short(lm_ptr[j]);
			}
		}
	}

	/**
	 * \brief Accumulate similarity in a 16x16 patch around `center` (16-bit accumulator).
	 *
	 * Local variant of similarity(): each feature is offset so the patch covers the
	 * 16x16 decimated neighborhood of center; offsets are made multiples of T to
	 * agree with the original code.
	 */
	static void similarityLocal(const std::vector<Mat>& linear_memories, const Template& templ,
		Mat& dst, Size size, int T, Point center)
	{

		CV_Assert(templ.features.size() < 8192);

		int W = size.width / T;
		dst = Mat::zeros(16, 16, CV_16U);

		// Top-left of the 16x16 patch, snapped to a multiple of T.
		int offset_x = (center.x / T - 8) * T;
		int offset_y = (center.y / T - 8) * T;
		mipp::Reg<uint8_t> zero_v = uint8_t(0);

		for (int i = 0; i < (int)templ.features.size(); ++i)
		{
			Feature f = templ.features[i];
			f.x += offset_x;
			f.y += offset_y;
			// Discard feature if out of bounds, possibly due to applying the offset
			if (f.x < 0 || f.y < 0 || f.x >= size.width || f.y >= size.height)
				continue;

			const uchar* lm_ptr = accessLinearMemory(linear_memories, f, T, W);
			{
				short* dst_ptr = dst.ptr<short>();

				if (mipp::N<uint8_t>() > 32) { //512 bits SIMD
					// Several 16-pixel patch rows fit in one register; only half the
					// register's byte capacity is gathered since values widen to short.
					for (int row = 0; row < 16; row += mipp::N<int16_t>() / 16) {
						mipp::Reg<int16_t> dst_v((int16_t*)dst_ptr + row * 16);

						// load lm_ptr, 16 bytes once, for half
						uint8_t local_v[mipp::N<uint8_t>()] = { 0 };
						for (int slice = 0; slice < mipp::N<uint8_t>() / 16 / 2; slice++) {
							std::copy_n(lm_ptr, 16, &local_v[16 * slice]);
							lm_ptr += W;
						}
						mipp::Reg<uint8_t> src8_v(local_v);
						// uchar to short, once for N bytes
						mipp::Reg<int16_t> src16_v(mipp::interleavelo(src8_v, zero_v).r);

						mipp::Reg<int16_t> res_v = src16_v + dst_v;
						res_v.store((int16_t*)dst_ptr);

						dst_ptr += mipp::N<int16_t>();
					}
				}
				else { // 256 128 or no SIMD
					for (int row = 0; row < 16; ++row) {
						for (int col = 0; col < 16; col += mipp::N<int16_t>()) {
							mipp::Reg<uint8_t> src8_v((uint8_t*)lm_ptr + col);

							// uchar to short, once for N bytes
							mipp::Reg<int16_t> src16_v(mipp::interleavelo(src8_v, zero_v).r);

							mipp::Reg<int16_t> dst_v((int16_t*)dst_ptr + col);
							mipp::Reg<int16_t> res_v = src16_v + dst_v;
							res_v.store((int16_t*)dst_ptr + col);
						}
						dst_ptr += 16;
						lm_ptr += W; // next decimated image row
					}
				}
			}
		}
	}

	/**
	 * \brief Whole-image similarity for templates with fewer than 64 features
	 *        (8-bit accumulator; see the overflow note below).
	 */
	static void similarity_64(const std::vector<Mat>& linear_memories, const Template& templ,
		Mat& dst, Size size, int T)
	{
		// 63 features or less is a special case because the max similarity per-feature is 4.
		// 255/4 = 63, so up to that many we can add up similarities in 8 bits without worrying
		// about overflow. Therefore here we use _mm_add_epi8 as the workhorse, whereas a more
		// general function would use _mm_add_epi16.
		CV_Assert(templ.features.size() < 64);
		/// @todo Handle more than 255/MAX_RESPONSE features!!

		// Decimate input image size by factor of T
		int W = size.width / T;
		int H = size.height / T;

		// Feature dimensions, decimated by factor T and rounded up
		int wf = (templ.width - 1) / T + 1;
		int hf = (templ.height - 1) / T + 1;

		// Span is the range over which we can shift the template around the input image
		int span_x = W - wf;
		int span_y = H - hf;

		// Compute number of contiguous (in memory) pixels to check when sliding feature over
		// image. This allows template to wrap around left/right border incorrectly, so any
		// wrapped template matches must be filtered out!
		int template_positions = span_y * W + span_x + 1; // why add 1?
		//int template_positions = (span_y - 1) * W + span_x; // More correct?

		/// @todo In old code, dst is buffer of size m_U. Could make it something like
		/// (span_x)x(span_y) instead?
		dst = Mat::zeros(H, W, CV_8U);
		uchar* dst_ptr = dst.ptr<uchar>();

		// Compute the similarity measure for this template by accumulating the contribution of
		// each feature
	  //  std::vector<int>feature_vector(template_positions,0);
		for (int i = 0; i < (int)templ.features.size(); ++i)
		{
			// Add the linear memory at the appropriate offset computed from the location of
			// the feature in the template
			Feature f = templ.features[i];
			// Discard feature if out of bounds
			/// @todo Shouldn't actually see x or y < 0 here?
			if (f.x < 0 || f.x >= size.width || f.y < 0 || f.y >= size.height)
				continue;
			const uchar* lm_ptr = accessLinearMemory(linear_memories, f, T, W);

			// Now we do an aligned/unaligned add of dst_ptr and lm_ptr with template_positions elements
			int j = 0;

			// SIMD body: plain 8-bit adds (safe given the < 64 feature bound above).
			for (; j <= template_positions - mipp::N<uint8_t>(); j += mipp::N<uint8_t>()) {

				mipp::Reg<uint8_t> src_v((uint8_t*)lm_ptr + j);
				mipp::Reg<uint8_t> dst_v((uint8_t*)dst_ptr + j);

				mipp::Reg<uint8_t> res_v = src_v + dst_v;
				res_v.store((uint8_t*)dst_ptr + j);
			}

			// Scalar tail.
			for (; j < template_positions; j++)
			{

				dst_ptr[j] += lm_ptr[j];
			}
		}
	}

	/**
	 * \brief 16x16-patch similarity around `center` for templates with fewer than
	 *        64 features (8-bit accumulator; see similarity_64 for the bound).
	 */
	static void similarityLocal_64(const std::vector<Mat>& linear_memories, const Template& templ,
		Mat& dst, Size size, int T, Point center)
	{
		// Similar to whole-image similarity() above. This version takes a position 'center'
		// and computes the energy in the 16x16 patch centered on it.
		CV_Assert(templ.features.size() < 64);

		// Compute the similarity map in a 16x16 patch around center
		int W = size.width / T;
		dst = Mat::zeros(16, 16, CV_8U);

		// Offset each feature point by the requested center. Further adjust to (-8,-8) from the
		// center to get the top-left corner of the 16x16 patch.
		// NOTE: We make the offsets multiples of T to agree with results of the original code.
		int offset_x = (center.x / T - 8) * T;
		int offset_y = (center.y / T - 8) * T;

		for (int i = 0; i < (int)templ.features.size(); ++i)
		{
			Feature f = templ.features[i];
			f.x += offset_x;
			f.y += offset_y;
			// Discard feature if out of bounds, possibly due to applying the offset
			if (f.x < 0 || f.y < 0 || f.x >= size.width || f.y >= size.height)
				continue;

			const uchar* lm_ptr = accessLinearMemory(linear_memories, f, T, W);

			{
				uchar* dst_ptr = dst.ptr<uchar>();

				if (mipp::N<uint8_t>() > 16) { // 256 or 512 bits SIMD
					// Multiple 16-pixel patch rows are processed per register.
					for (int row = 0; row < 16; row += mipp::N<uint8_t>() / 16) {
						mipp::Reg<uint8_t> dst_v((uint8_t*)dst_ptr);

						// load lm_ptr, 16 bytes once
						// (local_v is fully overwritten by the copy loop below)
						uint8_t local_v[mipp::N<uint8_t>()];
						for (int slice = 0; slice < mipp::N<uint8_t>() / 16; slice++) {
							std::copy_n(lm_ptr, 16, &local_v[16 * slice]);
							lm_ptr += W;
						}
						mipp::Reg<uint8_t> src_v(local_v);

						mipp::Reg<uint8_t> res_v = src_v + dst_v;
						res_v.store((uint8_t*)dst_ptr);

						dst_ptr += mipp::N<uint8_t>();
					}
				}
				else { // 128 or no SIMD
					for (int row = 0; row < 16; ++row) {
						for (int col = 0; col < 16; col += mipp::N<uint8_t>()) {
							mipp::Reg<uint8_t> src_v((uint8_t*)lm_ptr + col);
							mipp::Reg<uint8_t> dst_v((uint8_t*)dst_ptr + col);
							mipp::Reg<uint8_t> res_v = src_v + dst_v;
							res_v.store((uint8_t*)dst_ptr + col);
						}
						dst_ptr += 16;
						lm_ptr += W; // next decimated image row
					}
				}
			}
		}
	}

	/****************************************************************************************\
	*                                High-level Detector API                                 *
	\****************************************************************************************/

	Detector::Detector()
	{
		// Default configuration: ColorGradient modality with a two-level
		// pyramid sampled at T=4 (finest) and T=8 (coarsest).
		modality = makePtr<ColorGradient>();
		pyramid_levels = 2;
		T_at_level = { 4, 8 };
	}

	/**
	 * \brief Construct a detector with the default ColorGradient modality.
	 * \param T  Sampling step per pyramid level (index 0 = finest); its size
	 *           determines the number of pyramid levels.
	 */
	Detector::Detector(std::vector<int> T)
	{
		this->modality = makePtr<ColorGradient>();
		pyramid_levels = static_cast<int>(T.size());
		T_at_level = std::move(T); // sink parameter: move instead of copying
	}

	Detector::Detector(int num_features, std::vector<int> T, float weak_thresh, float strong_thresh)
		: weak_thresh(weak_thresh), strong_thresh(strong_thresh)
	{
		// 'weak' gates feature extraction, 'strong' gates the response-map
		// magnitude test used when quantizing gradients.
		this->modality = makePtr<ColorGradient>(weak_thresh, num_features, strong_thresh);
		res_map_mag_thresh = strong_thresh;
		pyramid_levels = T.size();
		T_at_level = T;
	}

	// Greatest common divisor, iterative Euclidean algorithm.
	static int gcd(int a, int b) {
		while (a != 0) {
			const int r = b % a;
			b = a;
			a = r;
		}
		return b;
	}
	// Least common multiple. Divide before multiplying: gcd(a, b) always
	// divides a, so (a / gcd) * b cannot overflow int as long as the LCM
	// itself fits — unlike the naive (a * b) / gcd form.
	static int lcm(int a, int b) {
		return (a / gcd(a, b)) * b;
	}
	static int least_mul_of_Ts(const std::vector<int>& T_at_level) {
		assert(T_at_level.size() > 0);
		int cur_res = T_at_level[0];
		for (int i = 1; i < T_at_level.size(); i++) {
			int cur_v = T_at_level[i] << i;
			cur_res = lcm(cur_v, cur_res);
		}
		return cur_res;
	}

	/**
	 * \brief Run template matching over 'source' and return matches above 'threshold'.
	 *
	 * Builds the quantized-gradient linear memories for every pyramid level via
	 * the fused node pipeline, then matches the requested classes against them.
	 *
	 * \param source     Input image (gray or 3-channel); internally cropped so its
	 *                   size is a multiple of every level's effective sampling step.
	 * \param threshold  Similarity threshold (percent); weaker matches are dropped.
	 * \param class_ids  Classes to match; empty means match every registered class.
	 * \param mask       Not supported yet — must be empty.
	 * \return Matches sorted by similarity with duplicates removed.
	 */
	std::vector<Match> Detector::match(Mat& source, float threshold, const std::vector<string>& class_ids, const Mat& mask)
	{
		// Reset the optional gradient-debug outputs from any previous run.
		dx_ = cv::Mat();
		dy_ = cv::Mat();

		std::vector<Match> matches;

		// --------- fusion version of response map creation

		// results we want
		LinearMemoryPyramid lm_pyramid(pyramid_levels, std::vector<LinearMemories>(1, LinearMemories(8)));
		std::vector<Size> sizes;

		assert(mask.empty() && "mask not support yet");

		// no need to crop now, we deal with it internally
		// Crop dimensions to a multiple of the LCM of all effective Ts so that
		// every pyramid level divides evenly into TxT cells.
		const int lcm_Ts = least_mul_of_Ts(T_at_level);
		const int biggest_imgRows = source.rows / lcm_Ts * lcm_Ts;
		const int biggest_imgCols = source.cols / lcm_Ts * lcm_Ts;

		// Tile size used by the fusion pipeline's blocked processing.
		const int tileRows = 32;
		const int tileCols = 256;
		const int num_threads_ = omp_get_max_threads(); // number of threads

		// Squared so the pipeline can test dx^2+dy^2 without a square root.
		const int32_t mag_thresh_l2 = int32_t(res_map_mag_thresh * res_map_mag_thresh);

		cv::Mat pyrdown_src;
		for (int cur_l = 0; cur_l < T_at_level.size(); cur_l++) {
			// timer.reset();
			// Every level except the last also produces the next level's
			// downsampled image as a pipeline side product.
			const bool need_pyr = cur_l < T_at_level.size() - 1;

			const int imgRows = biggest_imgRows >> cur_l;
			const int imgCols = biggest_imgCols >> cur_l;

			const int cur_T = T_at_level[cur_l];
			assert(cur_T % 2 == 0);

			// use old linear function will create those for us

			// One linearized memory per quantized orientation: T*T rows, one
			// row per offset inside a TxT cell.
			for (int ori = 0; ori < 8; ori++) {
				lm_pyramid[cur_l][0][ori] = cv::Mat(cur_T * cur_T, imgCols / cur_T * imgRows / cur_T, CV_8U);
			}

			sizes.push_back({ imgCols, imgRows });

			cv::Mat src;
			if (cur_l == 0) src = source;
			else src = pyrdown_src;

			if (need_pyr) pyrdown_src = cv::Mat(imgRows / 2, imgCols / 2, CV_8U);

			// Assemble the fused pipeline: (BGR->gray) -> separable Gaussian
			// (+pyrDown) -> separable Sobel -> magnitude/phase quantization ->
			// orientation histogram -> spread (x then y) -> response map ->
			// TxT linearization.
			simple_fusion::ProcessManager manager(fusion_buffers, tileRows, tileCols);
			manager.set_num_threads(num_threads_);
			if (src.channels() == 3)
				manager.get_nodes().push_back(std::make_shared<simple_fusion::BGR2GRAY_8UC3_8U>());
			manager.get_nodes().push_back(std::make_shared<simple_fusion::Gauss1x5Node_8U_32S_4bit_larger>());
			manager.get_nodes().push_back(std::make_shared<simple_fusion::Gauss5x1withPyrdownNode_32S_16S_4bit_smaller>(
				pyrdown_src, need_pyr));
			manager.get_nodes().push_back(std::make_shared<simple_fusion::Sobel1x3SxxSyxNode_16S_16S>());

			// Optionally expose the level-0 gradients (dx_/dy_) for callers.
			if (set_produce_dxy && cur_l == 0) {
				dx_ = cv::Mat(src.size(), CV_16S, cv::Scalar(0));
				dy_ = cv::Mat(src.size(), CV_16S, cv::Scalar(0));
				manager.get_nodes().push_back(std::make_shared<simple_fusion::Sobel3x1SxySyyNodeWithDxy_16S_16S>(dx_, dy_));
			}
			else {
				manager.get_nodes().push_back(std::make_shared<simple_fusion::Sobel3x1SxySyyNode_16S_16S>());
			}

			manager.get_nodes().push_back(std::make_shared<simple_fusion::MagPhaseQuant1x1Node_16S_8U>(mag_thresh_l2));
			manager.get_nodes().push_back(std::make_shared<simple_fusion::Hist3x3Node_8U_8U>());
			manager.get_nodes().push_back(std::make_shared<simple_fusion::Spread1xnNode_8U_8U>(cur_T + 1));
			manager.get_nodes().push_back(std::make_shared<simple_fusion::Spreadnx1Node_8U_8U>(cur_T + 1));
			manager.get_nodes().push_back(std::make_shared<simple_fusion::Response1x1Node_8U_8U>());
			manager.get_nodes().push_back(std::make_shared<simple_fusion::LinearizeTxTNode_8U_8U>(cur_T, imgCols,
				lm_pyramid[cur_l][0]));
			manager.arrange(imgRows, imgCols);

			std::vector<cv::Mat> in_v;
			in_v.push_back(src);

			std::vector<cv::Mat> out_v = lm_pyramid[cur_l][0];
			manager.process(in_v, out_v);

		}
		// ----------------------------------------------------------------------
		if (class_ids.empty()) {
			// Match all templates
			TemplatesMap::const_iterator it = class_templates.begin(), itend = class_templates.end();
			for (; it != itend; ++it)
				matchClass(lm_pyramid, sizes, threshold, matches, it->first, it->second);
		}
		else {
			// Match only templates for the requested class IDs
			for (int i = 0; i < (int)class_ids.size(); ++i) {
				TemplatesMap::const_iterator it = class_templates.find(class_ids[i]);
				if (it != class_templates.end())
					matchClass(lm_pyramid, sizes, threshold, matches, it->first, it->second);
			}
		}
		// Sort matches by similarity, and prune any duplicates introduced by pyramid refinement
		std::sort(matches.begin(), matches.end());
		std::vector<Match>::iterator new_end = std::unique(matches.begin(), matches.end());
		matches.erase(new_end, matches.end());
		//time1.out("matches");
		return matches;
	}

	// Unary predicate for remove_if: true when a match scores below threshold.
	struct MatchPredicate
	{
		explicit MatchPredicate(float _threshold) : threshold(_threshold) {}
		bool operator()(const Match& m) const { return m.similarity < threshold; }
		float threshold;
	};



	/**
	 * \brief Match every template pyramid of one class against the linear memories.
	 *
	 * Coarse-to-fine search: each template is first scored over the whole image
	 * at the coarsest pyramid level, then each surviving candidate is refined
	 * locally while marching down to level 0. Templates are split across OpenMP
	 * threads; each thread gathers results into its own vector and the vectors
	 * are merged afterwards, so the hot loop is lock-free.
	 *
	 * \param lm_pyramid        Linearized response maps for every pyramid level.
	 * \param sizes             Image size at each pyramid level.
	 * \param threshold         Minimum similarity (percent) to keep a match.
	 * \param[out] matches      Surviving matches for this class are appended here.
	 * \param class_id          Class name recorded into each produced Match.
	 * \param template_pyramids All template pyramids registered for this class.
	 */
	void Detector::matchClass(const LinearMemoryPyramid& lm_pyramid,
		const std::vector<Size>& sizes,
		float threshold, std::vector<Match>& matches,
		const std::string& class_id,
		const std::vector<TemplatePyramid>& template_pyramids) const
	{
		int thread_nums = omp_get_max_threads();
		// One result vector per thread: no synchronization while matching.
		std::vector<vector<Match>>local_matches(thread_nums);
#pragma omp parallel num_threads(thread_nums)
		{
			//  std::vector<vector<Match>>& thread_matches = local_matches[thread_id];
			const int thread_id = omp_get_thread_num();
			vector<Match>& thread_matches = local_matches[thread_id];
			// Template costs vary widely; dynamic scheduling balances the load.
#pragma omp for schedule(dynamic,1)
			for (int template_id = 0; template_id < template_pyramids.size(); ++template_id)
			{
				const TemplatePyramid& tp = template_pyramids[template_id];
				// First match over the whole image at the lowest pyramid level
				/// @todo Factor this out into separate function
				const std::vector<LinearMemories>& lowest_lm = lm_pyramid.back();

				std::vector<Match> candidates;
				{
					// Compute similarity maps for each ColorGradient at lowest pyramid level
					Mat similarities;
					int lowest_start = static_cast<int>(tp.size() - 1);
					int lowest_T = T_at_level.back();
					int num_features = 0;

					{
						const Template& templ = tp[lowest_start];
						num_features += static_cast<int>(templ.features.size());

						// Small templates take the 8-bit fast path and are
						// widened to 16-bit; larger ones accumulate in 16-bit.
						if (templ.features.size() < 64) {
							similarity_64(lowest_lm[0], templ, similarities, sizes.back(), lowest_T);
							similarities.convertTo(similarities, CV_16U);
						}
						else if (templ.features.size() < 8192) {
							similarity(lowest_lm[0], templ, similarities, sizes.back(), lowest_T);
						}
						else {
							CV_Error(Error::StsBadArg, "feature size too large");
						}
					}

					// Find initial matches
					for (int r = 0; r < similarities.rows; ++r)
					{
						ushort* row = similarities.ptr<ushort>(r);
						for (int c = 0; c < similarities.cols; ++c)
						{
							int raw_score = row[c];
							// Normalize the raw accumulator to a percentage
							// (presumably x25 maps a perfect per-feature
							// response onto 100 — see the response LUT nodes).
							float score = (raw_score) * 25.0f / (num_features);

							if (score > threshold)
							{
								int offset = /*lowest_T / 2 + */(lowest_T % 2 - 1); // spread has no offset now
								int x = c * lowest_T + offset;
								int y = r * lowest_T + offset;
								candidates.emplace_back(x, y, score, class_id, static_cast<int>(template_id));
							}
						}
					}
				}
				if (!candidates.empty()) {
					Mat similarities2;
					// Locally refine each match by marching up the pyramid
					//for (int l = pyramid_levels - 2; l >= 0; --l)
					for (int l = pyramid_levels-2 ; l >= 0; --l)
					{

						const std::vector<LinearMemories>& lms = lm_pyramid[l];
						int T = T_at_level[l];
						int start = static_cast<int>(l);
						Size size = sizes[l];
						int border = 8 * T;
						int offset = /*T / 2 +*/ (T % 2 - 1); // spread has no offset now
						int max_x = size.width - tp[start].width - border;
						int max_y = size.height - tp[start].height - border;

						for (int m = 0; m < (int)candidates.size(); ++m)
						{
							Match& match2 = candidates[m];
							// Map the coarser-level position to this level.
							int x = match2.x * 2 + 1; /// @todo Support other pyramid distance
							int y = match2.y * 2 + 1;

							// Require 8 (reduced) row/cols to the up/left
							x = std::max(x, border);
							y = std::max(y, border);

							// Require 8 (reduced) row/cols to the down/left, plus the template size
							x = std::min(x, max_x);
							y = std::min(y, max_y);

							// Compute local similarity maps for each ColorGradient
							int numFeatures = 0;

							{
								const Template& templ = tp[start];
								numFeatures += static_cast<int>(templ.features.size());

								if (templ.features.size() < 64) {
									similarityLocal_64(lms[0], templ, similarities2, size, T, Point(x, y));
									similarities2.convertTo(similarities2, CV_16U);
								}
								else if (templ.features.size() < 8192) {
									similarityLocal(lms[0], templ, similarities2, size, T, Point(x, y));
								}
								else {
									CV_Error(Error::StsBadArg, "feature size too large");
								}
							}

							// Find best local adjustment
							float best_score = 0;
							int best_r = -1, best_c = -1;
							for (int r = 0; r < similarities2.rows; ++r)
							{
								ushort* row = similarities2.ptr<ushort>(r);
								for (int c = 0; c < similarities2.cols; ++c)
								{
									int score_int = row[c];
									float score = (score_int * 25.0f) / (numFeatures);

									if (score > best_score)
									{
										best_score = score;
										best_r = r;
										best_c = c;
									}
								}
							}
							// Update current match
							match2.similarity = best_score;
							match2.x = (x / T - 8 + best_c) * T + offset;
							match2.y = (y / T - 8 + best_r) * T + offset;
						}

						// Filter out any matches that drop below the similarity threshold
						std::vector<Match>::iterator new_end = std::remove_if(candidates.begin(), candidates.end(),
							MatchPredicate(threshold));
						candidates.erase(new_end, candidates.end());
					}
				}
				// thread_matches.emplace_back(candidates);
				if (!candidates.empty()) {
					thread_matches.insert(thread_matches.end(), candidates.begin(), candidates.end());
				}
			}
		}
		// Merge per-thread results: one reserve, then bulk moves.
		size_t total_matches = 0;
		for (const auto& vec : local_matches) total_matches += vec.size();
		matches.reserve(matches.size() + total_matches);
		for (auto& vec : local_matches) {
			matches.insert(matches.end(), std::make_move_iterator(vec.begin()),
				std::make_move_iterator(vec.end()));
		}
	}
//void Detector::matchClass(const LinearMemoryPyramid& lm_pyramid,
//	const std::vector<Size>& sizes,
//	float threshold, std::vector<Match>& matches,
//	const std::string& class_id,
//	const std::vector<TemplatePyramid>& template_pyramids) const
//{
//	int thread_nums = omp_get_max_threads();
//	//int thread_nums = 1;
//	//std::vector<std::vector<std::vector<Match>>> local_matches(omp_get_max_threads());
//	//std::vector<std::vector<Match>> local_matches(thread_nums);
//#pragma omp parallel num_threads(thread_nums)
//	{
//
//		//  std::vector<vector<Match>>& thread_matches = local_matches[thread_id];
//		vector<Match> thread_matches;
//#pragma omp for schedule(dynamic,1)
//		for (int template_id = 0; template_id < template_pyramids.size(); ++template_id)
//		{
//			const TemplatePyramid& tp = template_pyramids[template_id];
//			// First match over the whole image at the lowest pyramid level
//			/// @todo Factor this out into separate function
//			const std::vector<LinearMemories>& lowest_lm = lm_pyramid.back();
//
//			std::vector<Match> candidates;
//			{
//				// Compute similarity maps for each ColorGradient at lowest pyramid level
//				Mat similarities;
//				int lowest_start = static_cast<int>(tp.size() - 1);
//				int lowest_T = T_at_level.back();
//				int num_features = 0;
//
//				{
//					const Template& templ = tp[lowest_start];
//					num_features += static_cast<int>(templ.features.size());
//
//					if (templ.features.size() < 64) {
//						similarity_64(lowest_lm[0], templ, similarities, sizes.back(), lowest_T);
//						similarities.convertTo(similarities, CV_16U);
//					}
//					else if (templ.features.size() < 8192) {
//						similarity(lowest_lm[0], templ, similarities, sizes.back(), lowest_T);
//					}
//					else {
//						CV_Error(Error::StsBadArg, "feature size too large");
//					}
//				}
//
//				// Find initial matches
//				for (int r = 0; r < similarities.rows; ++r)
//				{
//					ushort* row = similarities.ptr<ushort>(r);
//					for (int c = 0; c < similarities.cols; ++c)
//					{
//						int raw_score = row[c];
//						float score = (raw_score) * 25.0f / (num_features);
//
//						if (score > threshold)
//						{
//							int offset = /*lowest_T / 2 + */(lowest_T % 2 - 1); // spread has no offset now
//							int x = c * lowest_T + offset;
//							int y = r * lowest_T + offset;
//							candidates.emplace_back(x, y, score, class_id, static_cast<int>(template_id));
//						}
//					}
//				}
//			}
//			if (!candidates.empty()) {
//				// Locally refine each match by marching up the pyramid
//				for (int l = pyramid_levels - 2; l >= 0; --l)
//				{
//
//					const std::vector<LinearMemories>& lms = lm_pyramid[l];
//					int T = T_at_level[l];
//					int start = static_cast<int>(l);
//					Size size = sizes[l];
//					int border = 8 * T;
//					int offset = /*T / 2 +*/ (T % 2 - 1); // spread has no offset now
//					int max_x = size.width - tp[start].width - border;
//					int max_y = size.height - tp[start].height - border;
//
//					Mat similarities2;
//					for (int m = 0; m < (int)candidates.size(); ++m)
//					{
//						Match& match2 = candidates[m];
//						int x = match2.x * 2 + 1; /// @todo Support other pyramid distance
//						int y = match2.y * 2 + 1;
//
//						// Require 8 (reduced) row/cols to the up/left
//						x = std::max(x, border);
//						y = std::max(y, border);
//
//						// Require 8 (reduced) row/cols to the down/left, plus the template size
//						x = std::min(x, max_x);
//						y = std::min(y, max_y);
//
//						// Compute local similarity maps for each ColorGradient
//						int numFeatures = 0;
//
//						{
//							const Template& templ = tp[start];
//							numFeatures += static_cast<int>(templ.features.size());
//
//							if (templ.features.size() < 64) {
//								similarityLocal_64(lms[0], templ, similarities2, size, T, Point(x, y));
//								similarities2.convertTo(similarities2, CV_16U);
//							}
//							else if (templ.features.size() < 8192) {
//								similarityLocal(lms[0], templ, similarities2, size, T, Point(x, y));
//							}
//							else {
//								CV_Error(Error::StsBadArg, "feature size too large");
//							}
//						}
//
//						// Find best local adjustment
//						float best_score = 0;
//						int best_r = -1, best_c = -1;
//						for (int r = 0; r < similarities2.rows; ++r)
//						{
//							ushort* row = similarities2.ptr<ushort>(r);
//							for (int c = 0; c < similarities2.cols; ++c)
//							{
//								int score_int = row[c];
//								float score = (score_int * 25.0f) / (numFeatures);
//
//								if (score > best_score)
//								{
//									best_score = score;
//									best_r = r;
//									best_c = c;
//								}
//							}
//						}
//						// Update current match
//						match2.similarity = best_score;
//						match2.x = (x / T - 8 + best_c) * T + offset;
//						match2.y = (y / T - 8 + best_r) * T + offset;
//					}
//
//					// Filter out any matches that drop below the similarity threshold
//					std::vector<Match>::iterator new_end = std::remove_if(candidates.begin(), candidates.end(),
//						MatchPredicate(threshold));
//					candidates.erase(new_end, candidates.end());
//				}
//				// thread_matches.emplace_back(candidates);
//				if (!candidates.empty()) {
//					thread_matches.insert(thread_matches.end(), candidates.begin(), candidates.end());
//				}
//			}
//			//thread_matches.emplace_back(candidates);
//			//local_matches[template_id] = std::move(candidates);
//		}
//#pragma omp critical
//		{
//			matches.insert(matches.end(), thread_matches.begin(), thread_matches.end());
//		}
//	}
//	//for (int i = 0; i < local_matches.size(); i++) {
//	 //   for (int j = 0; j < local_matches[i].size(); j++) {
//			// matches.insert(matches.end(), local_matches[i][j].begin(), local_matches[i][j].end());
//	 //       matches.emplace_back(local_matches[i][j]);
//	  //  }
//	//}
//}


	/**
	 * \brief Extract a template pyramid from 'source' and register it under 'class_id'.
	 *
	 * \param source       Image to extract the template from.
	 * \param class_id     Class under which the template is stored.
	 * \param object_mask  Mask restricting feature extraction to the object.
	 * \param num_features If > 0, overrides the modality's feature count.
	 * \return The new template's id, or -1 if extraction failed at any level.
	 */
	int Detector::addTemplate(const Mat source, const std::string& class_id,
		const Mat& object_mask, int num_features)
	{
		std::vector<TemplatePyramid>& template_pyramids = class_templates[class_id];
		int template_id = static_cast<int>(template_pyramids.size());

		TemplatePyramid tp;
		tp.resize(pyramid_levels);

		{
			// Extract a template at each pyramid level
			Ptr<ColorGradientPyramid> qp = modality->process(source, object_mask);

			if (num_features > 0)
				qp->num_features = num_features;

			for (int l = 0; l < pyramid_levels; ++l)
			{
				/// @todo Could do mask subsampling here instead of in pyrDown()
				if (l > 0)
					qp->pyrDown();

				bool success = qp->extractTemplate(tp[l]);
				if (!success)
					return -1; // not enough features at this level
			}
		}

		//    Rect bb =
		cropTemplates(tp);

		// Move the pyramid into the registry instead of copying it
		// (resolves the old "avoid a copy with swap" @todo).
		template_pyramids.push_back(std::move(tp));
		return template_id;
	}
	// Rotate a 2D point about the origin by angRad (CW in y-down image
	// coordinates). cos/sin are evaluated once each instead of twice; the
	// arithmetic (double intermediate, float result) is unchanged.
	static cv::Point2f rotate2d(const cv::Point2f inPoint, const double angRad)
	{
		const double c = std::cos(angRad);
		const double s = std::sin(angRad);
		cv::Point2f outPoint;
		outPoint.x = static_cast<float>(c * inPoint.x - s * inPoint.y);
		outPoint.y = static_cast<float>(s * inPoint.x + c * inPoint.y);
		return outPoint;
	}

	// Rotate a 2D point about the origin by angRad (CW in y-down image
	// coordinates) and scale it. cos/sin are evaluated once each instead of
	// twice; the arithmetic (double intermediate, float result) is unchanged.
	static cv::Point2f rotateAndScale2d(const cv::Point2f inPoint, const double angRad, const float scale)
	{
		const double c = std::cos(angRad);
		const double s = std::sin(angRad);
		cv::Point2f outPoint;
		outPoint.x = static_cast<float>(scale * (c * inPoint.x - s * inPoint.y));
		outPoint.y = static_cast<float>(scale * (s * inPoint.x + c * inPoint.y));
		return outPoint;
	}
	// Rotate and scale inPoint about an arbitrary center.
	static cv::Point2f rotateAndScalePoint(const cv::Point2f inPoint, const cv::Point2f center, const double angRad, const float scale)
	{
		const cv::Point2f relative = inPoint - center;
		return rotateAndScale2d(relative, angRad, scale) + center;
	}
	// Rotate inPoint about an arbitrary center.
	static cv::Point2f rotatePoint(const cv::Point2f inPoint, const cv::Point2f center, const double angRad)
	{
		const cv::Point2f relative = inPoint - center;
		return rotate2d(relative, angRad) + center;
	}

	/**
	 * \brief Register template pyramids for every angle/scale variant in 'shapes'.
	 *
	 * A single base pyramid is extracted from the producer's base image; each
	 * variant is then derived by rotating/scaling the base features, avoiding a
	 * full re-extraction per variant. If extraction fails at some pyramid level,
	 * the detector permanently reduces pyramid_levels (and T_at_level) to the
	 * depth that succeeded.
	 *
	 * \param class_id      Class under which all variants are stored.
	 * \param shapes        Supplies the base image/mask and the angle/scale infos.
	 * \param num_features  If > 0, overrides the feature count per template.
	 */
	void Detector::addTemplates(const string& class_id, shape_based_matching::shapeInfo_producer& shapes, int num_features)
	{

		TemplatePyramid base_tp;
		base_tp.resize(pyramid_levels);
		{
			// Extract a template at each pyramid level
			Ptr<ColorGradientPyramid> qp = modality->process(shapes.src_of_base(), shapes.mask_of_base());
			if (num_features > 0)
				qp->num_features = num_features;

			for (int l = 0; l < pyramid_levels; ++l)
			{
				/// @todo Could do mask subsampling here instead of in pyrDown()
				if (l > 0)
					qp->pyrDown();

				bool success = qp->extractTemplate(base_tp[l]);
				if (!success)
				{
					// Feature extraction failed at this level: shrink the
					// pyramid to the levels that worked instead of aborting.
					//
					//std::cout << "create failed" << std::endl;
					//return ;
					if (l == 0) {
						pyramid_levels = 1;
						base_tp.resize(1);
						base_tp.shrink_to_fit();
						T_at_level.resize(1);
						T_at_level.shrink_to_fit();
						std::cout << "adaptive_pyramid_levels to :" << pyramid_levels << std::endl;
						break;
					}
					else {
						pyramid_levels = l;
						base_tp.resize(l);
						base_tp.shrink_to_fit();
						T_at_level.resize(l);
						T_at_level.shrink_to_fit();
						std::cout << "adaptive_pyramid_levels to :" << pyramid_levels << std::endl;
					}
					break;
				}
			}
		}
		// Keep the level-0 base template so its feature points can be visualized.
		baseTemplate = base_tp[0];
		baseTemplate.width = shapes.src.cols;
		baseTemplate.height = shapes.src.rows;
		cropTemplates(base_tp);



		//std::cout << baseTemplate.width << std::endl;
		/// @todo Can probably avoid a copy of tp here with swap

		std::vector<TemplatePyramid>& template_pyramids = class_templates[class_id];
		cv::Point2f center = { (shapes.src.cols - 1) / 2.0f,(shapes.src.rows - 1) / 2.0f };
		int n = template_pyramids.size();
		template_pyramids.resize(n + shapes.infos.size());
		// Variants are independent: derive them in parallel, each writing into
		// its own pre-sized slot, so no synchronization is needed.
#pragma omp parallel for 
		for (int i = 0; i < shapes.infos.size(); i++) {
			cv::Point2f temp_center = center;
			auto& info = shapes.infos[i];
			float theta = info.angle;
			const auto& to_rotate_tp = base_tp;
			TemplatePyramid tp;
			tp.resize(pyramid_levels);
			for (int l = 0; l < pyramid_levels; ++l)
			{
				// The rotation center halves along with the image resolution.
				if (l > 0) { temp_center /= 2.0; }
				int m = tp[l].features.size(); // resize once, then assign by index (faster than push_back)
				tp[l].features.resize(m + to_rotate_tp[l].features.size());
				for (int j = 0; j < to_rotate_tp[l].features.size(); j++) {
					const auto& f = to_rotate_tp[l].features[j];
					// Feature coordinates are stored relative to the cropped
					// template origin; go back to absolute coordinates first.
					Point2f p;
					p.x = f.x + to_rotate_tp[l].tl_x;
					p.y = f.y + to_rotate_tp[l].tl_y;
					// Point2f p_rot = rotateAndScalePoint(p, temp_center, -theta / 180 * CV_PI,info.scale);
					Point2f p_rot = rotateAndScalePoint(p, temp_center, -theta / 180 * CV_PI, info.scale);
					Feature f_new;
					f_new.x = int(round(p_rot.x));
					f_new.y = int(round(p_rot.y));
					// Rotate the orientation too and normalize into [0, 360).
					f_new.theta = f.theta - theta;
					while (f_new.theta > 360) f_new.theta -= 360;
					while (f_new.theta < 0) f_new.theta += 360;

					// Quantize into 8 orientation labels (16 bins folded by &7).
					f_new.label = int(f_new.theta * 16 / 360 + 0.5f);
					f_new.label &= 7;
					// tp[l].features.emplace_back(f_new);
					tp[l].features[m + j] = std::move(f_new);
				}
				tp[l].pyramid_level = l;
			}
			cropTemplates(tp);
			template_pyramids[n + i] = std::move(tp);
		}
		//template_pyramids.emplace_back(base_tp);
	}

	int Detector::addTemplate_rotate(const string& class_id, int zero_id,
		float theta, cv::Point2f center)
	{
		std::vector<TemplatePyramid>& template_pyramids = class_templates[class_id];
		int template_id = static_cast<int>(template_pyramids.size());

		const auto& to_rotate_tp = template_pyramids[zero_id];

		TemplatePyramid tp;
		tp.resize(pyramid_levels);

		for (int l = 0; l < pyramid_levels; ++l)
		{
			if (l > 0) center /= 2.0;

			for (auto& f : to_rotate_tp[l].features) {
				Point2f p;
				p.x = f.x + to_rotate_tp[l].tl_x;
				p.y = f.y + to_rotate_tp[l].tl_y;
				Point2f p_rot = rotatePoint(p, center, -theta / 180 * CV_PI);

				Feature f_new;
				f_new.x = int(std::round(p_rot.x));
				f_new.y = int(std::round(p_rot.y));

				f_new.theta = f.theta - theta;
				while (f_new.theta > 360) f_new.theta -= 360;
				while (f_new.theta < 0) f_new.theta += 360;

				f_new.label = int(f_new.theta * 16 / 360 + 0.5f);
				f_new.label &= 7;


				tp[l].features.push_back(f_new);
			}
			tp[l].pyramid_level = l;
		}

		cropTemplates(tp);

		template_pyramids.push_back(tp);
		return template_id;
	}
	const std::vector<Template>& Detector::getTemplates(const std::string& class_id, int template_id) const
	{
		// Both the class and the template id must exist.
		const auto it = class_templates.find(class_id);
		CV_Assert(it != class_templates.end());
		CV_Assert(size_t(template_id) < it->second.size());
		return it->second[template_id];
	}

	int Detector::numTemplates() const
	{
		int ret = 0;
		TemplatesMap::const_iterator i = class_templates.begin(), iend = class_templates.end();
		for (; i != iend; ++i)
			ret += static_cast<int>(i->second.size());
		return ret;
	}

	int Detector::numTemplates(const std::string& class_id) const
	{
		// Template count for a single class; 0 when the class is unknown.
		const auto it = class_templates.find(class_id);
		return (it == class_templates.end()) ? 0 : static_cast<int>(it->second.size());
	}

	std::vector<std::string> Detector::classIds() const
	{
		std::vector<std::string> ids;
		TemplatesMap::const_iterator i = class_templates.begin(), iend = class_templates.end();
		for (; i != iend; ++i)
		{
			ids.push_back(i->first);
		}

		return ids;
	}

	void Detector::read(const FileNode& fn)
	{
		// Reading a detector resets any previously registered templates;
		// template data itself is restored separately via readClass().
		class_templates.clear();
		modality = makePtr<ColorGradient>();

		pyramid_levels = fn["pyramid_levels"];
		fn["T"] >> T_at_level;
	}

	void Detector::write(FileStorage& fs) const
	{
		// Persist the pyramid configuration, then let the modality append
		// its own settings.
		fs << "pyramid_levels" << pyramid_levels
		   << "T" << T_at_level;
		modality->write(fs);
	}

	std::string Detector::readClass(const FileNode& fn, const std::string& class_id_override)
	{
		// Detector should not already have this class
		String class_id;
		if (class_id_override.empty())
		{
			String class_id_tmp = fn["class_id"];
			CV_Assert(class_templates.find(class_id_tmp) == class_templates.end());
			class_id = class_id_tmp;
		}
		else
		{
			class_id = class_id_override;
		}

		TemplatesMap::value_type v(class_id, std::vector<TemplatePyramid>());
		std::vector<TemplatePyramid>& tps = v.second;
		int expected_id = 0;

		FileNode tps_fn = fn["template_pyramids"];
		tps.resize(tps_fn.size());
		FileNodeIterator tps_it = tps_fn.begin(), tps_it_end = tps_fn.end();
		for (; tps_it != tps_it_end; ++tps_it, ++expected_id)
		{
			int template_id = (*tps_it)["template_id"];
			CV_Assert(template_id == expected_id);
			FileNode templates_fn = (*tps_it)["templates"];
			tps[template_id].resize(templates_fn.size());

			FileNodeIterator templ_it = templates_fn.begin(), templ_it_end = templates_fn.end();
			int idx = 0;
			for (; templ_it != templ_it_end; ++templ_it)
			{
				tps[template_id][idx++].read(*templ_it);
			}
		}

		class_templates.insert(v);
		return class_id;
	}

	void Detector::writeClass(const std::string& class_id, FileStorage& fs) const
	{
		TemplatesMap::const_iterator it = class_templates.find(class_id);
		CV_Assert(it != class_templates.end());
		const std::vector<TemplatePyramid>& tps = it->second;

		fs << "class_id" << it->first;
		fs << "pyramid_levels" << pyramid_levels;
		fs << "template_pyramids"
			<< "[";
		for (size_t i = 0; i < tps.size(); ++i)
		{
			const TemplatePyramid& tp = tps[i];
			fs << "{";
			fs << "template_id" << int(i); //TODO is this cast correct? won't be good if rolls over...
			fs << "templates"
				<< "[";
			for (size_t j = 0; j < tp.size(); ++j)
			{
				fs << "{";
				tp[j].write(fs);
				fs << "}"; // current template
			}
			fs << "]"; // templates
			fs << "}"; // current pyramid
		}
		fs << "]"; // pyramids
	}

	void Detector::readClasses(const std::vector<std::string>& class_ids,
		const std::string& format)
	{
		// Load one file per class; 'format' is a printf-style filename pattern.
		for (const auto& class_id : class_ids)
		{
			const String filename = cv::format(format.c_str(), class_id.c_str());
			FileStorage fs(filename, FileStorage::READ);
			readClass(fs.root());
		}
	}

	void Detector::writeClasses(const std::string& format) const
	{
		TemplatesMap::const_iterator it = class_templates.begin(), it_end = class_templates.end();
		for (; it != it_end; ++it)
		{
			const String& class_id = it->first;
			String filename = cv::format(format.c_str(), class_id.c_str());
			FileStorage fs(filename, FileStorage::WRITE);
			writeClass(class_id, fs);
		}
	}

} // namespace line2Dup
